# Breakthrough infections regression 
# MKD 
# May 21 2024 

## Packages 
if(!require(pacman)) install.packages("pacman")
library(pacman)
p_load(data.table, fixest, ggplot2, collapse, dplyr, modelsummary, flextable)
 
# NOTE: the former `rm(list = ls())` call was dropped. It only removes objects
# from the global environment (attached packages and options are untouched),
# so it does not give a clean session. Run the script in a fresh R session
# instead.

# Read in the analytic file
dt <- fread("covid_inpc_cohort/data/covid_tests_inpc_2018_2019_cohort.csv")

# Check for duplicates: count the rows whose STUDY_ID repeats an earlier row.
# The count is still shown when run interactively, and a warning is raised if
# it is not zero so the problem is also visible when the script is sourced.
n_dup_ids <- dt[duplicated(STUDY_ID), .N]
n_dup_ids
if (n_dup_ids > 0L) {
  warning(
    n_dup_ids, " row(s) in the analytic file repeat an earlier STUDY_ID.",
    call. = FALSE
  )
}

# Dummy variable -- Positive Covid Test in 2021
# 1 when any of the four npos_test_2021_* counts is at least 1, 0 otherwise.
# NOTE(review): unlike the vaccine dummies below, an NA result is not recoded
# to 0 here, so a row with a missing count and no count >= 1 keeps
# infect_2021 = NA -- confirm the count columns are never NA.
dt[, infect_2021 := fifelse(
  Reduce(`|`, lapply(.SD, function(n_pos) n_pos >= 1)),
  1, 0
), .SDcols = paste0("npos_test_2021_", 1:4)]

# Dummy variable -- Vaccinated in 2021?
# 1 when any of the four dose dates falls in 2021. Rows where the comparison
# is NA (no date matches and at least one date is missing) are coded 0.
dt[, vax_2021 := fifelse(
  year(first_date) == 2021 | year(second_date) == 2021 |
    year(third_date) == 2021 | year(fourth_date) == 2021,
  1, 0,
  na = 0
)]

# Dummy variable -- Vaccinated in 2022?
# Same rule as vax_2021, applied to calendar year 2022.
dt[, vax_2022 := fifelse(
  year(first_date) == 2022 | year(second_date) == 2022 |
    year(third_date) == 2022 | year(fourth_date) == 2022,
  1, 0,
  na = 0
)]

# Covariates
# Race dummies. A row with RACE = NA gets NA in both, because the comparison
# itself evaluates to NA.
dt[, `:=`(
  white = as.numeric(RACE == "WHITE"),
  black = as.numeric(RACE == "BLACK OR AFRICAN AMERICAN")
)]

# Everyone who is neither white nor black.
dt[, other_race := as.numeric(white == 0 & black == 0)]

# Sex dummy, and age in whole years as of calendar year 2021.
dt[, `:=`(
  female = as.numeric(GENDER == "F"),
  age = 2021 - year(DOB)
)]

# Frequency checks for each covariate (age sorted by cell size).
dt[, .N, by = white]
dt[, .N, by = black]
dt[, .N, by = other_race]
dt[, .N, by = "female"]
dt[, .N, by = "age"][order(N)]

# Partition the data into young, mid, old
# age is a whole number of years (2021 minus birth year), so the bands below
# are: young = 39 and under, med = 40 to 64, old = 65 and over. Rows with a
# missing age fall into none of the three subsets.
# NOTE(review): the results table labels the youngest band "Age 18-39", but no
# lower bound is applied here -- confirm the cohort contains no one under 18.
  dt_young <- dt[age < 40]
  dt_med <- dt[age > 39 & age < 65]
  dt_old <- dt[age > 64]

# Regressions on older group
# OLS of the 2022 vaccination dummy on 2021 infection, 2021 vaccination and
# their interaction, estimated on dt_old with heteroskedasticity-robust
# standard errors. The four fits differ only in the controls they add.
# other_race is not included, so it is the omitted race category.

  # No controls
  reg_old_nocon <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021,
    data = dt_old,
    vcov = "hetero"
  )

  # Race and sex controls, no age term
  old_reg <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black+ female,
    data = dt_old,
    vcov = "hetero"
  )

  # Race and sex controls plus a linear age term
  reg_old_con_age_lin <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black + female + age,
    data = dt_old,
    vcov = "hetero"
  )

  # Race and sex controls plus one dummy per year of age
  reg_old_con_age_fe <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black + female + factor(age),
    data = dt_old,
    vcov = "hetero"
  )

# Regressions on middle age
# Same specifications as for the other age bands, estimated on dt_med with
# heteroskedasticity-robust standard errors.

  # No controls
  reg_med_nocon <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021,
    data = dt_med,
    vcov = "hetero"
  )

  # Race and sex controls plus a linear age term
  reg_med_con_age_lin <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black + female + age,
    data = dt_med,
    vcov = "hetero"
  )

  # Race and sex controls plus one dummy per year of age
  reg_med_con_age_fe <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black + female + factor(age),
    data = dt_med,
    vcov = "hetero"
  )

# Regressions on young
# Same specifications as for the other age bands, estimated on dt_young with
# heteroskedasticity-robust standard errors.
# The formula and data are passed by name in every call: feols() takes the
# formula as its first parameter, so handing it the data table positionally
# (as two of these calls used to) puts the data in the formula slot.

  # No controls
  reg_young_nocon <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021,
    data = dt_young,
    vcov = "hetero"
  )

  # Race and sex controls plus one dummy per year of age
  reg_young_con_age_fe <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black + female + factor(age),
    data = dt_young,
    vcov = "hetero"
  )

  # Race and sex controls plus a linear age term
  reg_young_con_age_lin <- feols(
    fml = vax_2022 ~ infect_2021 + vax_2021 + infect_2021*vax_2021 + white + black + female + age,
    data = dt_young,
    vcov = "hetero"
  )

# Mean of outcomes
# vax_2022 is a 0/1 dummy, so each mean is the share of the age band that was
# vaccinated in 2022. The means are taken over every row of the subset.
dt_old_mean <- dt_old[, .(Mean_vax_2022_old = mean(vax_2022))]
dt_med_mean <- dt_med[, .(Mean_vax_2022_med = mean(vax_2022))]
dt_young_mean <- dt_young[, .(Mean_vax_2022_young = mean(vax_2022))]

# Combine means into a single one-row table
means <- cbind(dt_old_mean, dt_med_mean, dt_young_mean)

# Create the extra table row: a label followed by one value per model, in the
# order young, middle, old. The values are formatted to three decimals as
# text so they are not written out as raw unrounded numbers. The row is built
# directly as a data.table so the column names stay exactly as written
# (data.frame() would turn `Model 1` into `Model.1`).
mean_row <- data.table(
  Term = "Mean of Outcome",
  `Model 1` = sprintf("%.3f", means$Mean_vax_2022_young),
  `Model 2` = sprintf("%.3f", means$Mean_vax_2022_med),
  `Model 3` = sprintf("%.3f", means$Mean_vax_2022_old)
)

# Models with the per-year age dummies, one table column per age band
# (the list names become the column headers).
models_age_fe <- setNames(
  list(reg_young_con_age_fe, reg_med_con_age_fe, reg_old_con_age_fe),
  c("Age 18-39", "Age 40-64", "Age 65+")
)

# Goodness-of-fit rows to show: raw statistic name, display label, and the
# number of decimals for each.
gof_map <- data.frame(
  raw = c("nobs", "r.squared"),
  clean = c("N", "R2"),
  fmt = c(0, 2)
)

# Coefficients to show and their display labels (names are the model terms,
# values are the labels). Terms not listed here, such as the race, sex and
# age controls, are left out of the table.
coef_map <- setNames(
  c("Intercept", "2021 Infection", "2021 Vaccine", "2021 Infection x 2021 Vaccine"),
  c("(Intercept)", "infect_2021", "vax_2021", "infect_2021:vax_2021")
)

# Export the regressions
# Writes the models in models_age_fe side by side to a Word document, keeping
# only the coefficients in coef_map and the statistics in gof_map, and
# appending the mean-of-outcome row.
#
# The former `add_rows_label` and `add_rows_position` arguments were removed:
# modelsummary() has no such parameters. The label is the first column of the
# add_rows table, and rows are appended at the bottom of the table by default.
# To place the row elsewhere, set a "position" attribute on the table passed
# to add_rows, e.g. attr(mean_row, "position") <- 9.
#
# NOTE(review): the models were fitted with vcov = "hetero", while
# vcov = "robust" here asks modelsummary to recompute the standard errors,
# possibly with a different heteroskedasticity correction -- confirm which
# one the table should report.
modelsummary(
  models_age_fe,
  output = "covid_inpc_cohort/results/reg_age_fe.docx",
  title = "Breakthrough Regressions",
  stars = TRUE,
  vcov = "robust",
  coef_map = coef_map,
  gof_map = gof_map,
  add_rows = mean_row
)